Explore called duplications

Author

Claudia Zirión-Martínez

Published

February 13, 2025

Index

Setup

Libraries

Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)

Paths

Code
metadata_path <-
    "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <-
    "../Crypto_Desjardins_Ashton/results/02.Dataset/chromosomes.csv"
chrom_lengths_path <-
    "data/processed/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <-
    "../Crypto_Desjardins/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
    "../Crypto_Ashton/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <-
    "../Crypto_Desjardins_Ashton/results/02.Dataset/cnv/cnv_calls.tsv"
duplications_out_path <-
    "results/tables/duplications_putative.tsv"
repeats_path_prefix <-
    "../Crypto_Desjardins/results/03.References/"

Prepare dataset

Metadata

Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.

Code
metadata <- read.delim(
    metadata_path,
    header=TRUE,
    sep=",",
    stringsAsFactor = TRUE)
metadata <- metadata %>%
    select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
    filter(!strain == "H99") %>%
    group_by(dataset, lineage)%>%
    mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
    ungroup() %>%
    group_by(lineage)%>%
    mutate(samples_in_lineage = n_distinct(sample))%>%
    ungroup()%>%
    mutate(total_samples = n_distinct(sample))%>%
    droplevels()

Chromosome names and length

Get the nice chromosome names from the chromosomes file in WeavePop.

Code
chromosome_names = read.delim(
    chromosomes_path,
    header=TRUE, sep=",")
chromosome_names <- chromosome_names %>%
    mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
    mutate(chromosome = as.factor(chromosome))
levels(chromosome_names$chromosome) <- paste("chr", chromosome_names$chromosome, sep="")

Get the chromosome lengths. Create the file with bash.

Code
# /usr/bin/bash
tail -n +2 ../Crypto_Desjardins/config/chromosomes.csv | \
    cut -d',' -f1 | sort | uniq | while read line 
do
seqkit fx2tab ../Crypto_Desjardins/data/references/$line.fasta -l -i -n >> data/processed/chromosome_lengths.tsv
done
Code
chromosome_lengths = read.delim(
    chrom_lengths_path,
    header=FALSE, 
    col.names=c("accession", "length"), 
    sep="\t")

Called CNVs

Import the file with all called CNVs and add the chromosome names.

Code
cnv_calls <- read.delim(
    cnv_calls_path, 
    header=TRUE, sep="\t")%>%
    left_join(chromosome_names, by="accession")
Code
depth_threshold <- 0.6
del_threshold <- 1 - depth_threshold
dup_threshold <- 1 + depth_threshold

Explore Called CNVs

Code
cnv_summary <- cnv_calls %>%
    group_by(cnv) %>%
    summarize(
        count = n(),
        min_norm_depth = min(norm_depth, na.rm = TRUE),
        mean_norm_depth = mean(norm_depth, na.rm = TRUE),
        q25_norm_depth = quantile(norm_depth, probs = 0.25, na.rm = TRUE),
        median_norm_depth = median(norm_depth, na.rm = TRUE),
        q75_norm_depth = quantile(norm_depth, probs = 0.75, na.rm = TRUE),
        max_norm_depth = max(norm_depth, na.rm = TRUE),
        min_smooth_depth = min(smooth_depth, na.rm = TRUE),
        mean_smooth_depth = mean(smooth_depth, na.rm = TRUE),
        q25_smooth_depth = quantile(smooth_depth, probs = 0.25, na.rm = TRUE),
        median_smooth_depth = median(smooth_depth, na.rm = TRUE),
        q75_smooth_depth = quantile(smooth_depth, probs = 0.75, na.rm = TRUE),
        max_smooth_depth = max(smooth_depth, na.rm = TRUE),
        min_region_size = min(region_size, na.rm = TRUE),
        mean_region_size = mean(region_size, na.rm = TRUE),
        q25_region_size = quantile(region_size, probs = 0.25, na.rm = TRUE),
        median_region_size = median(region_size, na.rm = TRUE),
        q75_region_size = quantile(region_size, probs = 0.75, na.rm = TRUE),
        max_region_size = max(region_size, na.rm = TRUE),
    )
cnv_summary
cnv count min_norm_depth mean_norm_depth q25_norm_depth median_norm_depth q75_norm_depth max_norm_depth min_smooth_depth mean_smooth_depth q25_smooth_depth median_smooth_depth q75_smooth_depth max_smooth_depth min_region_size mean_region_size q25_region_size median_region_size q75_region_size max_region_size
deletion 49641 0 0.0743412 0.00 0.01 0.08 6.82 0.00 0.0973373 0.00 0.04 0.15 0.49 3 14582.99 6500 10000 19994 538500
duplication 9356 0 6.3060400 1.53 1.62 1.94 113.87 1.51 6.1987730 1.53 1.60 1.87 111.11 8 18550.87 3500 8000 14000 948000

There are a lot more deletions than duplications.

Repeats

Percentage of repeats per lineage per chromosome

Get the percentage of each chromosome covered by repeats to know how much of a chromosome might not have reliable CNV calls.

Code
lineages <- unique(metadata$lineage)
repeats_all <- data.frame()
for(lineage in lineages){
    repeats_path <-paste(
        repeats_path_prefix,
        lineage, "/", lineage, "_repeats.bed",
        sep ="")
    repeats <- read.delim(repeats_path, 
        header=FALSE, 
        col.names=c("accession", "start", "end", "repeat_type"), 
        sep="\t")
    repeats$lineage <- lineage
    repeats_all <- rbind(repeats_all, repeats)
}
Code
repeats_percent <- repeats_all %>%
    mutate(repeat_size_each = end - start)%>%
    group_by(accession, lineage) %>%
    summarise(repeat_size = sum(repeat_size_each)) %>%
    left_join(chromosome_lengths, by="accession") %>%
    left_join(chromosome_names, by=c("accession","lineage")) %>%
    mutate(percent_repeat_size = round((repeat_size / length) * 100, 2))%>%
    mutate(chromosome = as.factor(chromosome))%>%
    select(lineage, accession, chromosome, percent_repeat_size)

Most chromosomes have repeats in between 5 and 10% of the size. In VNBII it is closer to 15% for some chormosomes.

Distribution of Repeat Fraction

Code
d <- ggplot(cnv_calls, aes(repeat_fraction, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1, scales = "free")+
            geom_histogram(binwidth = 0.01)+
            scale_x_continuous(labels = scales::comma)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Repeat Fraction of CNVs",
                subtitle = paste("Binwidth: 0.01"),
                y = "Count",
                x = "Repeat Fraction")
d 

In deletions, more of them have high repetitive fraction. In duplications, most of them have low repetitive fraction.

Depth

Distribution of Normalized Depth and Smooth Normalized Depth

The Normalized depth (norm_depth) is the median of the normalized mean depth of the windows that are part of the CNV region.

The Smooth normalized depth (smooth_depth) is the median of the smooth normalized mean depth of the windows that are part of the CNV region.

Code
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.1)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Normalized Depth",
            y = "Count (Log 10)",
            x = "Normalized Depth")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.1)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Smooth Normalized Depth",
            y = "Count (Log 10)",
            x = "Smooth Normalized Depth")

d | s

There are CNVs with extremely high normalized depth. I will truncate the normalized depth to 5.

Code
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.01)+
            geom_vline(xintercept = del_threshold)+
            geom_vline(xintercept = dup_threshold)+
            scale_y_log10(labels = scales::comma) +
            xlim(0,5)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Normalized Depth",
            subtitle = paste("Lines in depth threshold:", depth_threshold),
            y = "Count (Log 10)",
            x = "Normalized Depth (truncated)")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.01)+
            geom_vline(xintercept = del_threshold)+
            geom_vline(xintercept = dup_threshold)+
            scale_y_log10(labels = scales::comma) +
            xlim(0,5)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Smooth Normalized Depth",
            subtitle = paste("Lines in depth threshold:", depth_threshold),
            y = "Count (Log 10)",
            x = "Smooth Normalized Depth (truncated)")

d | s

Most deletions have depth (normalized and smooth) close to 0. Most duplications have depth (normalized and smooth) close to the depth threshold with which CNVs were called.

Normalized Depth vs. Smooth Normalized Depth

Code
p <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "Normalized Depth",
         y = "Smooth Normalized Depth")

o <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_x_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth (Truncated axes)",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "Normalized Depth",
         y = "Smooth Normalized Depth")
p / o

Since the CNVs are called using the smooth_depth, but it does not always coincide with the norm_depth, there are called CNVs in between the depth threshold used in the CNV calling. Most CNVs are in the diagonal where smooth depth is the same as normalized depth.

There are many duplications with extremely high depth that are in the same high interval of repetitive fraction.

There are more deletion in the high repetitive fraction interval. There are more duplications in the low repetitive fraction interval.

Size

Distribution of Sizes of CNVs

Code
d <- ggplot(cnv_calls, aes(region_size, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 1000)+
            scale_x_continuous(labels = scales::comma)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Sizes of CNVs",
                subtitle = paste("Binwidth: 1000\nLength of largest chromosome", scales::comma(max(chromosome_lengths$length))),
                y = "Count (Log10)",
                x = "Size of CNVs")

d 

Only duplication are larger than ~150kb.

Depth vs. Size

Code
p <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Smooth Normalized Depth")

o <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    theme_minimal() +
    labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size (Truncated Y axis)",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Smooth Normalized Depth")
p / o

There are many small CNVs in a high repetitive fraction interval that have extremely high depth.

Code
p <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs CNV Size",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Normalized Depth")

o <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs CNV Size (Truncated Y axis)",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Normalized Depth")
p / o

Mostly, the CNVs with very high smooth_depth are small.

The CNVs whose norm_depth is in between the depth thresholds are small. (These are the ones that the norm_depth and smooth_depth don’t coincide).

Some dupCNVs are large and have very high depth (the ones that are neither in the vertical cluster of small CNVs and high depth or the horizontal cluster of large CNVs and depth around 2):

Code
filter(cnv_calls, region_size > 50000, smooth_depth > 2.5)%>%
  select(sample, lineage, chromosome, norm_depth, smooth_depth, region_size, repeat_fraction)
sample lineage chromosome norm_depth smooth_depth region_size repeat_fraction
SRS885892 VNBI chr04 4.39 4.53 212500 0.03
SRS417676 VNI chr04 2.83 2.84 52500 0.08
SRS417676 VNI chr09 3.18 3.59 191000 0.02
ERS1142739 VNI chr05 3.11 3.25 55000 0.10
ERS2541126 VNI chr13 2.50 2.62 130500 0.12
ERS542397 VNI chr04 2.76 2.77 705000 0.03
ERS2541331 VNI chr04 3.75 3.79 82500 0.01
ERS2541212 VNI chr13 2.66 2.67 130000 0.12
ERS542490 VNI chr04 3.23 3.25 344500 0.06
ERS2541064 VNI chr06 5.10 5.18 65500 0.03

The CNVs with the highest Normalized Depth (smooth or not), are highly repetitive.

There are many delCNVs completely repetitive. The majority of the dupCNVs are not repetitive.

Explore dupCNVs

Keep only the CNVs labeled as duplications (dupCNVs)

Code
dup_calls <- filter(cnv_calls, cnv == "duplication")

Filter out high depth dupCNVs

To filter out the dupCNVs that have very large Normalized Depth. Here I make groups of CNVs that start at the same position in the same chromosome.

Code
dup_summary <- dup_calls %>%
    group_by(lineage, chromosome, start, end, region_size) %>%
    summarize(max_depth = max(norm_depth),
              median_depth = median(norm_depth),
              n = n(),
              median_repeat = median(repeat_fraction))

The dupCNVs groups with max depth larger than 5:

Code
summary_large_dups <- dup_summary %>%
                        filter((max_depth > 5))%>%
                        arrange(desc(n))
head(summary_large_dups)
lineage chromosome start end region_size max_depth median_depth n median_repeat
VNI chr01 339501 351000 11500 9.63 6.090 495 0.00
VNI chr02 274001 282000 8000 113.87 50.300 440 0.65
VNI chr02 274501 281500 7000 100.82 45.080 335 0.60
VNI chr10 582001 587500 5500 6.47 4.635 150 0.47
VNI chr02 274501 282000 7500 74.77 54.430 57 0.63
VNI chr01 1 3000 3000 36.98 8.070 25 0.10

Filter out duplications that are part of the groups in the previous table.
This way of filtering removes individual CNVs that not necessarily have depth grater than 5 but other samples in the lineage have a high depth CNV in the same region.

Code
dup_calls_filtered <- dup_calls %>%
    filter(!paste(start, end, chromosome, lineage) %in% 
           paste(summary_large_dups$start, summary_large_dups$end, summary_large_dups$chromosome, summary_large_dups$lineage))

Depth vs. Repeats per Chromosome

Code
n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
       geom_quasirandom()+
       facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Normalized Depth of dupCNVs",
        subtitle = "Panels by interval of Repeat Fraction",
        y = "Normalized Depth",
        x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
       geom_quasirandom()+
       facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
        subtitle = "Panels by interval of Repeat Fraction",
        y = "Smooth Normalized Depth",
        x = "Chromosome")
n | s

Filter out high repeat content CNVs

Code
repeat_threshold = 0.25
Code
summary_repetitive_dups <- dup_summary %>%
                            filter((median_repeat > 0.25))%>%
                            arrange(desc(n))
Code
dup_calls_filtered <- dup_calls_filtered %>%
    filter(!paste(start, end, chromosome, lineage) %in% 
           paste(summary_repetitive_dups$start, summary_repetitive_dups$end, summary_repetitive_dups$chromosome, summary_repetitive_dups$lineage))
Code
n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
       geom_quasirandom()+
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Normalized Depth of dupCNVs",
        y = "Normalized Depth",
        x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
       geom_quasirandom()+
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
        y = "Smooth Normalized Depth",
        x = "Chromosome")
n | s

Explore depth of chromosomes

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized). The Normalized mean depth of chromosomes (norm_chrom_mean) is the normalized mean of the depth of the positions in the whole chromosome. (First the mean was calculated and then normalized).

Code
depth_by_chrom_good_desjardins <- read.delim(
    depth_by_chrom_good_desj_path,
     header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
    depth_by_chrom_good_ashton_path,
     header=TRUE, sep="\t")
depth_by_chrom_good <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)
depth_by_chrom <- depth_by_chrom_good %>%
    select(sample, accession, norm_chrom_median, norm_chrom_mean)
head(depth_by_chrom)
sample accession norm_chrom_median norm_chrom_mean
SRS404518 CP003820.1 0.98 0.97
SRS404518 CP003821.1 0.98 1.12
SRS404518 CP003822.1 0.99 1.01
SRS404518 CP003823.1 1.00 0.99
SRS404518 CP003824.1 0.99 0.98
SRS404518 CP003825.1 1.00 0.99
Code
med <- ggplot(depth_by_chrom)+
        geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
        theme_minimal() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")
mea <- ggplot(depth_by_chrom)+
        geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
        theme_minimal() +
        labs(title = "Histogram of Normalized mean depth of chromosomes",
                y = "Count",
                x = "Normalized mean depth of chromosomes")

med / mea

Code
med <- ggplot(filter(depth_by_chrom, norm_chrom_median > 1.2 ))+
        geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
        scale_y_continuous(
           breaks = seq(0, max(table(round(depth_by_chrom$norm_chrom_median ))) , 1)
         )+
        theme_minimal() +
        theme(panel.grid.minor = element_blank())+
        labs(title = "Histogram of Normalized median depth of chromosomes",
                subtitle = "Only values above 1.2",
                y = "Count",
                x = "Normalized median depth of chromosomes")
mea <- ggplot(filter(depth_by_chrom, norm_chrom_mean > 1.2 ))+
        geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_mean), by = 0.1)) +
        #scale_y_continuous(breaks = seq(0, max(table(round(depth_by_chrom$norm_chrom_mean ))) , 1))+
        theme_minimal() +
        theme(panel.grid.minor = element_blank())+
        labs(title = "Histogram of Normalized mean depth of chromosomes",
                subtitle = "Only values above 1.2",
                y = "Count",
                x = "Normalized mean depth of chromosomes")
med / mea

Chromosomes with very high median depth:

Code
left_join(depth_by_chrom, chromosome_names, by = "accession")%>%
  left_join(metadata, by = c("sample", "lineage"))%>%
  select(lineage, sample, strain, chromosome, norm_chrom_median, norm_chrom_mean)%>%
  filter(norm_chrom_median > 2.2)%>%
  arrange(norm_chrom_median)
lineage sample strain chromosome norm_chrom_median norm_chrom_mean
VNI ERS1142697 20427_2#6 chr12 2.24 2.20
VNI ERS2541126 BMD3144 chr13 2.24 2.23
VNBI SRS885841 NRHc5009.REL.INI chr14 2.25 2.23
VNI SRS404481 Bt139 chr13 2.27 2.20
VNI SRS885916 PMHc1031A.ENR.INI.LP chr12 2.31 2.34
VNBII SRS881180 PMHc1029.ENR.STOR chr12 2.45 2.42
VNI ERS2541212 04CN-03-039 chr13 2.45 2.39
VNI ERS542397 14936_1#6 chr04 2.74 2.34

Metrics per chromosome (dupCNVs and chromosome depth)

The dataframe all_metrics contains a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs have NAs.

Code
all_metrics <- left_join(depth_by_chrom, dup_calls_filtered,
        by = c("sample", "accession"))%>%
    select(-chromosome, -lineage)%>%
    left_join(chromosome_lengths, by = "accession") %>%
    left_join(chromosome_names,by ="accession") %>%
    left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)
sample accession norm_chrom_median norm_chrom_mean start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id length lineage chromosome strain source dataset vni_subdivision samples_in_dataset_lineage samples_in_lineage total_samples
SRS404518 CP003820.1 0.98 0.97 NA NA NA NA NA NA NA NA NA NA 2291499 VNI chr01 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003821.1 0.98 1.12 NA NA NA NA NA NA NA NA NA NA 1621675 VNI chr02 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003822.1 0.99 1.01 NA NA NA NA NA NA NA NA NA NA 1575141 VNI chr03 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003823.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1084805 VNI chr04 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003824.1 0.99 0.98 NA NA NA NA NA NA NA NA NA NA 1814975 VNI chr05 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003825.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1422463 VNI chr06 Bt134 Clinical Desjardins VNIa-5 185 853 1055

Summarize the metrics of the called CNVs per chromosome

And keep the rest of the chromosome metrics.

Code
chrom_metrics <- all_metrics %>%
    group_by(sample, strain, chromosome, norm_chrom_mean, norm_chrom_median, length,
            accession,  source, lineage,
            samples_in_lineage, samples_in_dataset_lineage, 
            total_samples,dataset) %>%
    summarise(total_cnv_size = sum(region_size),
                n_cnvs = sum(!is.na(cnv)), #fix this
                first = min(start),
                last = max(end),
                mean_cnv_depth = round(mean(norm_depth),2),
                median_cnv_depth = round(median(norm_depth),2),
                repeat_size = sum(overlap_bp)) %>%
            ungroup()%>%
    mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
            dup_span_size = last - first,
            dup_span_percent = round((dup_span_size / length) * 100, 2),
            dup_repeat_percent = round((repeat_size / length) * 100, 2),
            chromosome = as.factor(chromosome),
            dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
            dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
head(chrom_metrics)
sample strain chromosome norm_chrom_mean norm_chrom_median length accession source lineage samples_in_lineage samples_in_dataset_lineage total_samples dataset total_cnv_size n_cnvs first last mean_cnv_depth median_cnv_depth repeat_size dup_coverage_percent dup_span_size dup_span_percent dup_repeat_percent
ERS1142690 20949_2#1 chr01 1.00 0.98 2291499 CP003820.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
ERS1142690 20949_2#1 chr02 1.20 0.96 1621675 CP003821.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
ERS1142690 20949_2#1 chr03 0.99 0.98 1575141 CP003822.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
ERS1142690 20949_2#1 chr04 1.00 1.00 1084805 CP003823.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
ERS1142690 20949_2#1 chr05 0.97 0.98 1814975 CP003824.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
ERS1142690 20949_2#1 chr06 1.00 1.00 1422463 CP003825.1 Clinical VNI 853 668 1055 Ashton NA 0 NA NA NA NA NA 0 NA 0 NA
The total number of chromosomes in all samples in the dataset is: 14770
The total number of chromosomes in all samples with called duplications is: 2384

Number of dupCNVs

Code
ggplot(filter(chrom_metrics, n_cnvs > 0))+
    geom_histogram(aes(n_cnvs), binwidth = 1)+
    scale_x_continuous() +
    theme_minimal() +
    labs(title = "Histogram of number of dupCNVs per chromosome",
            y = "Count",
            x = "Number of dupCNVs")

Percentage of chromosome Covered by dupCNVs

Code
p <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
        geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 1 dupCNV",
                y = "Percent of Chromosome Covered by dupCNVs",
                x = "Chromosome")
ggMarginal(p, type = "histogram")

Code
p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
        geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 10% covered",
                x = "Chromosome",
                y = "Percent of Chromosome Covered by dupCNVs")
ggMarginal(p, type = "histogram")

### Percentage of chromosome Spanned by dupCNVs
p <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
        geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 1 dupCNV",
                y = "Percent of Chromosome Spanned by dupCNVs",
                x = "Chromosome")
ggMarginal(p, type = "histogram")

Code
p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
        geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 10% spanned",
                y = "Percent of Chromosome Spanned by dupCNVs",
                x = "Chromosome")
ggMarginal(p, type = "histogram")

Explore relationship between CNV chromosomal metrics

Code
ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_hex(bins=50) +
        scale_x_continuous(labels = scales::comma) +
        geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Spanned by dupCNVs vs Covered by dupCNVs",
            y = "Percent of Chromosome Spanned by dupCNVs",
            x = "Percent of Chromosome Covered by dupCNVs")

Code
ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = n_cnvs)) +
        geom_hex(bins=50) +
        scale_x_continuous(labels = scales::comma) +
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Number of CNVs vs Percent of Chromosome Covered by dupCNVs ",
            y = "Number of CNVs",
            x = "Percent of Chromosome Covered by dupCNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = norm_chrom_median)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_viridis_c(name = "Normalized Median\nDepth of Chromosome", direction = -1) +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = norm_chrom_mean)) +
        facet_wrap(~cut(norm_chrom_median, breaks = 6), nrow = 2) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_viridis_c(name = "Normalized Mean\nDepth of Chromosome", direction = -1) +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Compare metrics of dupCNVs with chromosome depth

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each dupCNV region.

The Median depth of dupCNV (median_cnv_depth) is the median of the Normalized depth of all dupCNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).

The Percent of Chromosome Covered by dupCNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called dupCNVs.

The Percent of Chromosome Spanned by dupCNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost dupCNVs.

Code
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(mean_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Mean dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Mean Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = norm_chrom_median, y = norm_chrom_mean)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(mean_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Mean dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Normalized Mean Depth vs. Normalized Median Depth of Chromosome",
             y = "Normalized Mean Depth of Chromosome",
             x = "Normalized Median Depth of Chromosome")

Explore putative duplicated chromosomes

Filter by percent of chromosome in dupCNV or chromosome depth

Code
size_threshold <- 50
depth_threshold <- 1.55
Code
chrom_metrics_filtered <- chrom_metrics %>%
    filter(dup_coverage_percent >= size_threshold | norm_chrom_median >= 1.55)
Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
write_tsv(chrom_metrics_filtered, duplications_out_path)

Explore sample with very large Median Depth of Chromosome

Code
chrom_metrics_filtered %>%
    filter(norm_chrom_median > 2.5)%>%
    select(sample, strain, chromosome, norm_chrom_median, dup_coverage_percent)
sample strain chromosome norm_chrom_median dup_coverage_percent
ERS542397 14936_1#6 chr04 2.74 64.99